#160713 - based on Blast_v7_taxo.R

# All relative paths used further down are resolved against this directory.
setwd("~/Documents/UNI_und_VORLESUNGEN/11 phd projects/1 Meta SCHMALNAU/2 HiSeq biomass 160707/8 taxonomy")

library("seqinr")

# FASTA file holding the OTU sequences that get blasted below.
fastafile <- "../7 OTUs/D) OTU_KEEP.txt"

# as.string = TRUE: every sequence is read as one character string.
DNA <- read.fasta(file = fastafile, as.string = TRUE)

# Number of characters per sequence; looked up by sequence name later on
# when the query coverage of each hit is computed.
DNAlength <- nchar(DNA)

#attr(DNA, "name")


# download annotate package!
#source("https://bioconductor.org/biocLite.R")
#biocLite("annotate")
#biocLite("Biostrings")

library("annotate")
library("XML")

# blast data against NCBI!



#for (i in 1:10){
# Index of the first sequence to submit. The hard-coded 146 is presumably the
# resume point of an interrupted run -- TODO confirm; set to 1 to blast all.
blast_start <- 146

# Only indices that really exist: `146:length(DNA)` would count *backwards*
# (and index out of range) whenever DNA holds fewer than 146 sequences.
query_idx <- seq_along(DNA)
query_idx <- query_idx[query_idx >= blast_start]

# write.csv() does not create missing directories.
if (length(query_idx) > 0) {
  dir.create("blast_hits/blast", showWarnings = FALSE, recursive = TRUE)
}

for (i in query_idx) {
  # filter = "" removes the low-complexity filter.
  data <- blastSequences(DNA[i], as = "data.frame", hitListSize = 100, timeout = 4000, filter = "")
  Sys.sleep(2) # pause between requests to NCBI

  # Keep only the first HSP of every hit. A result without any columns is
  # written by write.csv() as a file whose first line is `""`; the later
  # steps of this script use that line to recognise empty results.
  data <- data[data$Hsp_num == 1, ]

  # One CSV per query, named after the sequence (names() instead of relying
  # on attr(..., "name") partially matching the "names" attribute).
  write.csv(data, file = paste0("blast_hits/blast/", names(DNA)[i], ".csv"))
}



###################
# download taxonomy!

# For every BLAST hit table in FOLDER, look the hit accessions up at NCBI
# (esearch -> esummary -> efetch) and store the taxonomy XML under the same
# file name in blast_hits/blast_taxonomy.
#
# NOTE(review): the plotting step pairs the taxonomy records with the hit rows
# by position. Whether esearch/efetch return one record per accession in the
# submitted order is not established anywhere in this script -- confirm.

FOLDER <- "blast_hits/blast"
# "\\.csv$": match the real extension only (an unescaped "." matches any character).
csvfiles <- list.files(FOLDER, full.names = TRUE, pattern = "\\.csv$")

# NCBI serves the E-utilities over HTTPS.
eutils_base <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# seq_along() instead of 1:length(): no iteration at all when FOLDER is empty.
for (f in seq_along(csvfiles)) {

  # Skip results without hits: such files consist of the single line `""`.
  # A file without any line is skipped as well instead of failing on NA.
  first_line <- readLines(csvfiles[f], n = 1)
  if (length(first_line) == 0 || first_line == "\"\"") {
    next
  }

  plotdata <- read.csv(csvfiles[f], stringsAsFactors = FALSE)
  if (nrow(plotdata) == 0) {
    next # header only: there are no accessions to look up
  }

  Sys.sleep(2) # pause between requests to NCBI

  # 1) accession numbers -> nuccore IDs
  search_url <- paste0(
    eutils_base, "esearch.fcgi?db=nuccore&RetMax=100&term=",
    paste(plotdata$Hit_accession, collapse = ",")
  )
  temp <- xmlToList(readLines(search_url))
  IDs <- as.vector(unlist(temp$IdList))
  if (length(IDs) == 0) {
    # Without IDs the next two requests would be sent with an empty id list.
    warning("no nuccore IDs found for ", csvfiles[f], "; taxonomy not downloaded", call. = FALSE)
    next
  }
  Sys.sleep(2)

  # 2) nuccore IDs -> taxonomy IDs (taken from the esummary XML lines)
  summary_url <- paste0(
    eutils_base, "esummary.fcgi?db=nuccore&id=",
    paste(IDs, collapse = ",")
  )
  TaxID <- readLines(summary_url)

  temp2 <- TaxID[grep("\t<Item Name=\"TaxId\" Type=\"Integer\">", TaxID)]
  temp2 <- sub("\t<Item Name=\"TaxId\" Type=\"Integer\">(.*)</Item>", "\\1", temp2) # parse XML

  Sys.sleep(2)

  # 3) taxonomy IDs -> full taxonomy records
  fetch_url <- paste0(
    eutils_base, "efetch.fcgi?db=taxonomy&id=",
    paste(temp2, collapse = ",")
  )
  taxonomy <- readLines(fetch_url)

  # Same file name as the hit table, but in the taxonomy folder
  # (created on demand, cat() does not create directories).
  out_file <- sub("blast_hits/blast", "blast_hits/blast_taxonomy", csvfiles[f])
  dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
  cat(taxonomy, file = out_file, sep = "\n")
}



############
# make plots
FOLDER <- "blast_hits/blast"

# Map proportions (0-1) to the colour scale shared by the identity and the
# coverage column: < 0.90 Red, >= 0.90 Orange, >= 0.95 Yellow,
# >= 0.98 lightgreen, exactly 1 Green. Missing values stay NA (no fill).
score_colours <- function(score) {
  score <- as.numeric(score)
  colours <- rep(NA_character_, length(score))
  # Later assignments overwrite earlier ones, so the order matters.
  colours[which(score < 0.90)] <- "Red"
  colours[which(score >= 0.90)] <- "Orange"
  colours[which(score >= 0.95)] <- "Yellow"
  colours[which(score >= 0.98)] <- "lightgreen"
  colours[which(score == 1)] <- "Green"
  colours
}

# Return the scientific name listed for `rank` (e.g. "order") in the lines of
# one <LineageEx> section, or NA when the lineage has no such rank. The name
# is taken from the line directly above the matching <Rank> line.
lineage_rank_name <- function(lineage, rank) {
  hit <- which(lineage == paste0("            <Rank>", rank, "</Rank>"))
  if (length(hit) == 0) {
    return(NA_character_)
  }
  name <- sub(
    "            <ScientificName>(.*)</ScientificName>", "\\1",
    lineage[hit[1] - 1]
  )
  if (length(name) == 0) NA_character_ else name
}

# One PDF per BLAST hit table: a similarity profile at the top and one text row
# per hit (definition, accession, % identity, % coverage, e-value, taxonomy).
plottables <- list.files(FOLDER, full.names = TRUE, pattern = "\\.csv$")

# seq_along() instead of 1:length(): no iteration when there are no tables.
for (f in seq_along(plottables)) {

  # Skip results without hits: such files consist of the single line `""`.
  first_line <- readLines(plottables[f], n = 1)
  if (length(first_line) == 0 || first_line == "\"\"") {
    next
  }

  plotdata <- read.csv(plottables[f], stringsAsFactors = FALSE)
  n <- nrow(plotdata)
  if (n == 0) {
    next # header only, nothing to draw
  }

  # File name of the table, still including ".csv"; it is used unchanged for
  # the PDF name and the title, so output names stay as they were.
  OTU <- basename(plottables[f])
  otu_name <- sub("\\.csv$", "", OTU) # anchored: strip the extension only

  # % identity of every hit
  ident <- plotdata$Hsp_identity / plotdata$Hsp_align.len

  # % coverage: aligned query bases (gaps removed) relative to the length of
  # the query sequence. match() yields NA (instead of an empty vector) when
  # the OTU is not among the loaded sequences.
  tempL <- unname(DNAlength[match(otu_name, names(DNAlength))])
  coverage <- nchar(gsub("-", "", plotdata$Hsp_qseq)) / tempL

  # ---- taxonomy (parsed before the PDF device is opened) ----
  taxo_file <- file.path("blast_hits", "blast_taxonomy", OTU)
  if (file.exists(taxo_file)) {
    taxo <- readLines(taxo_file)
  } else {
    warning("no taxonomy file for ", OTU, "; plotting without taxonomy", call. = FALSE)
    taxo <- character(0)
  }

  start <- grep("    <LineageEx>", taxo)
  end <- grep("    </LineageEx>", taxo)

  orderA <- rep(NA_character_, n)
  famA <- rep(NA_character_, n)

  for (i in seq_along(start)) {
    lineage <- taxo[start[i]:end[i]]
    orderA[i] <- lineage_rank_name(lineage, "order")
    famA[i] <- lineage_rank_name(lineage, "family")
    # Blank the parsed lineage so that only the taxon's own <ScientificName>
    # lines remain for the species search below.
    taxo[start[i]:end[i]] <- NA
  }

  species <- taxo[grep("    <ScientificName>.*</ScientificName>", taxo)]
  species <- sub("    <ScientificName>(.*)</ScientificName>", "\\1", species)

  # NOTE(review): taxonomy records are paired with hit rows by position;
  # confirm that the download step returns one record per hit, in hit order.
  # Indexing with seq_len(n) pads with NA, so a shorter vector is never
  # recycled onto the wrong rows.
  row_idx <- seq_len(n)
  orderA <- orderA[row_idx]
  famA <- famA[row_idx]
  species <- species[row_idx]

  # ---- drawing ----
  pdf(paste0("blast_hits", "/", OTU, "_plot.pdf"), width = 11, height = 15)
  par(mar = c(0, 0, 0, 0))
  plot(NULL, xlim = c(0, 100), ylim = c(0, 125), xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n")

  # Similarity profile: one bar per hit with more than 80 % identity; the bar
  # height is the identity above 80 %. which() drops NA, and the same index
  # is used for positions and heights so both always have the same length.
  rect(49, 104, 101, 126)
  high <- which(ident > 0.8)
  if (length(high) > 0) {
    bar_x <- high * 0.5 + 50
    rect(bar_x - 0.25, 105, bar_x + 0.25, 105 + ident[high] * 100 - 80, border = NA, col = "Black")
  }

  # Hit k is drawn at y = 100 - k. The coloured cells use the same rows as
  # the text; computing them from n instead only lines up for exactly 100 hits.
  row_y <- 100 - row_idx

  # colour: identity
  rect(40, row_y + 0.5, 43, row_y - 0.5, col = score_colours(ident), border = NA)

  # colour: coverage
  rect(45, row_y + 0.5, 48, row_y - 0.5, col = score_colours(coverage), border = NA)

  # colour: e-value (Green below 1e-20, Red otherwise)
  evalue <- as.numeric(plotdata$Hsp_evalue)
  evalue_col <- rep(NA_character_, n)
  evalue_col[which(evalue >= 1e-20)] <- "Red"
  evalue_col[which(evalue < 1e-20)] <- "Green"
  rect(50, row_y + 0.5, 56, row_y - 0.5, col = evalue_col, border = NA)

  # texts
  text(0, row_y, substr(plotdata$Hit_def, 1, 50), adj = 0, cex = 0.5)
  text(32, row_y, plotdata$Hit_accession, adj = 0, cex = 0.5)
  text(40, row_y, round(ident * 100, digits = 2), adj = 0, cex = 0.5) # % match
  text(45, row_y, round(coverage * 100, digits = 2), adj = 0, cex = 0.5)
  text(50, row_y, plotdata$Hsp_evalue, adj = 0, cex = 0.5)

  # taxonomy columns
  text(60, row_y, orderA, adj = 0, cex = 0.5)
  text(70, row_y, famA, adj = 0, cex = 0.5)
  text(80, row_y, species, adj = 0, cex = 0.5)

  # main text: table name plus taxonomy and identity of the first hit
  text(0, 123, OTU, adj = 0, cex = 1.8)
  text(0, 118, species[1], adj = 0, cex = 1.5)
  text(0, 114, famA[1], adj = 0, cex = 1.5)
  text(0, 110, orderA[1], adj = 0, cex = 1.5)

  text(0, 105, round(ident[1] * 100, digits = 2), adj = 0, cex = 2)

  dev.off()
}






